# Import the pandas package
import pandas as pd
# Load the dataset from a semicolon-separated CSV file into a DataFrame
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
# Display the DataFrame as the cell output
df
| id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | id1 | 58 | management | married | tertiary | no | 2143.0 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
| 1 | id2 | 44 | technician | single | secondary | no | 29.0 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
| 2 | id3 | 33 | entrepreneur | married | secondary | no | 2.0 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
| 3 | id4 | 47 | blue-collar | married | unknown | no | 1506.0 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
| 4 | id5 | 33 | unknown | single | unknown | no | 1.0 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45206 | id45207 | 51 | technician | married | tertiary | no | 825.0 | no | no | cellular | 17 | nov | 977 | 3 | -1 | 0 | unknown | yes |
| 45207 | id45208 | 71 | retired | divorced | primary | no | 1729.0 | no | no | cellular | 17 | nov | 456 | 2 | -1 | 0 | unknown | yes |
| 45208 | id45209 | 72 | retired | married | secondary | no | 5715.0 | no | no | cellular | 17 | nov | 1127 | 5 | 184 | 3 | success | yes |
| 45209 | id45210 | 57 | blue-collar | married | secondary | no | 668.0 | no | no | telephone | 17 | nov | 508 | 4 | -1 | 0 | unknown | no |
| 45210 | id45211 | 37 | entrepreneur | married | secondary | no | 2971.0 | no | no | cellular | 17 | nov | 361 | 2 | 188 | 11 | other | no |
45211 rows × 18 columns
# Characteristics of the dataset: preview its first rows
df.head()
| id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | id1 | 58 | management | married | tertiary | no | 2143.0 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
| 1 | id2 | 44 | technician | single | secondary | no | 29.0 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
| 2 | id3 | 33 | entrepreneur | married | secondary | no | 2.0 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
| 3 | id4 | 47 | blue-collar | married | unknown | no | 1506.0 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
| 4 | id5 | 33 | unknown | single | unknown | no | 1.0 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
# Preview the last rows of the dataset
df.tail()
| id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 45206 | id45207 | 51 | technician | married | tertiary | no | 825.0 | no | no | cellular | 17 | nov | 977 | 3 | -1 | 0 | unknown | yes |
| 45207 | id45208 | 71 | retired | divorced | primary | no | 1729.0 | no | no | cellular | 17 | nov | 456 | 2 | -1 | 0 | unknown | yes |
| 45208 | id45209 | 72 | retired | married | secondary | no | 5715.0 | no | no | cellular | 17 | nov | 1127 | 5 | 184 | 3 | success | yes |
| 45209 | id45210 | 57 | blue-collar | married | secondary | no | 668.0 | no | no | telephone | 17 | nov | 508 | 4 | -1 | 0 | unknown | no |
| 45210 | id45211 | 37 | entrepreneur | married | secondary | no | 2971.0 | no | no | cellular | 17 | nov | 361 | 2 | 188 | 11 | other | no |
# Dimensions of the dataset as (number of rows, number of columns)
df.shape
(45211, 18)
# Data type of each column of the dataset
df.dtypes
id object age int64 job object marital object education object default object balance float64 housing object loan object contact object day int64 month object duration int64 campaign int64 pdays int64 previous int64 poutcome object y object dtype: object
# Quick statistical summary of the dataset
df.describe()
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 45211.000000 | 45206.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 |
| mean | 40.931477 | 1362.403707 | 15.806419 | 258.163080 | 2.763841 | 40.197828 | 0.580323 |
| std | 10.623372 | 3044.906741 | 8.322476 | 257.527812 | 3.098021 | 100.128746 | 2.303441 |
| min | 10.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 33.000000 | 72.000000 | 8.000000 | 103.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 39.000000 | 448.000000 | 16.000000 | 180.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 48.000000 | 1428.000000 | 21.000000 | 319.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 | 871.000000 | 275.000000 |
# Same summary, extended to the qualitative (non-numeric) columns
df.describe(include = "all")
| id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 45211 | 45211.000000 | 45211 | 45211 | 45211 | 45211 | 45206.000000 | 45211 | 45211 | 45211 | 45211.000000 | 45211 | 45211.000000 | 45211.000000 | 45211.000000 | 45211.000000 | 45211 | 45211 |
| unique | 45211 | NaN | 12 | 3 | 4 | 2 | NaN | 2 | 2 | 3 | NaN | 12 | NaN | NaN | NaN | NaN | 4 | 2 |
| top | id1 | NaN | blue-collar | married | secondary | no | NaN | yes | no | cellular | NaN | may | NaN | NaN | NaN | NaN | unknown | no |
| freq | 1 | NaN | 9732 | 27214 | 23202 | 44396 | NaN | 25130 | 37967 | 29285 | NaN | 13766 | NaN | NaN | NaN | NaN | 36959 | 39922 |
| mean | NaN | 40.931477 | NaN | NaN | NaN | NaN | 1362.403707 | NaN | NaN | NaN | 15.806419 | NaN | 258.163080 | 2.763841 | 40.197828 | 0.580323 | NaN | NaN |
| std | NaN | 10.623372 | NaN | NaN | NaN | NaN | 3044.906741 | NaN | NaN | NaN | 8.322476 | NaN | 257.527812 | 3.098021 | 100.128746 | 2.303441 | NaN | NaN |
| min | NaN | 10.000000 | NaN | NaN | NaN | NaN | -8019.000000 | NaN | NaN | NaN | 1.000000 | NaN | 0.000000 | 1.000000 | -1.000000 | 0.000000 | NaN | NaN |
| 25% | NaN | 33.000000 | NaN | NaN | NaN | NaN | 72.000000 | NaN | NaN | NaN | 8.000000 | NaN | 103.000000 | 1.000000 | -1.000000 | 0.000000 | NaN | NaN |
| 50% | NaN | 39.000000 | NaN | NaN | NaN | NaN | 448.000000 | NaN | NaN | NaN | 16.000000 | NaN | 180.000000 | 2.000000 | -1.000000 | 0.000000 | NaN | NaN |
| 75% | NaN | 48.000000 | NaN | NaN | NaN | NaN | 1428.000000 | NaN | NaN | NaN | 21.000000 | NaN | 319.000000 | 3.000000 | -1.000000 | 0.000000 | NaN | NaN |
| max | NaN | 95.000000 | NaN | NaN | NaN | NaN | 102127.000000 | NaN | NaN | NaN | 31.000000 | NaN | 4918.000000 | 63.000000 | 871.000000 | 275.000000 | NaN | NaN |
# Missing values: fraction of null entries in each column
df.isnull().mean()
id 0.000000 age 0.000000 job 0.000000 marital 0.000000 education 0.000000 default 0.000000 balance 0.000111 housing 0.000000 loan 0.000000 contact 0.000000 day 0.000000 month 0.000000 duration 0.000000 campaign 0.000000 pdays 0.000000 previous 0.000000 poutcome 0.000000 y 0.000000 dtype: float64
# Keep only adult clients. The bound is inclusive: a strict `> 18` would
# also discard clients who are exactly 18, although they are adults.
df_majeur = df[df["age"] >= 18]
# Statistical summary of the filtered data
df_majeur.describe()
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| count | 45191.000000 | 45186.000000 | 45191.000000 | 45191.000000 | 45191.000000 | 45191.000000 | 45191.000000 |
| mean | 40.942223 | 1362.894967 | 15.809387 | 258.180501 | 2.764422 | 40.208072 | 0.580447 |
| std | 10.613323 | 3045.476834 | 8.321944 | 257.574852 | 3.098537 | 100.144958 | 2.303865 |
| min | 19.000000 | -8019.000000 | 1.000000 | 0.000000 | 1.000000 | -1.000000 | 0.000000 |
| 25% | 33.000000 | 72.000000 | 8.000000 | 103.000000 | 1.000000 | -1.000000 | 0.000000 |
| 50% | 39.000000 | 449.000000 | 16.000000 | 180.000000 | 2.000000 | -1.000000 | 0.000000 |
| 75% | 48.000000 | 1428.000000 | 21.000000 | 319.000000 | 3.000000 | -1.000000 | 0.000000 |
| max | 95.000000 | 102127.000000 | 31.000000 | 4918.000000 | 63.000000 | 871.000000 | 275.000000 |
# Column names of the dataset
df.columns
Index(['id', 'age', 'job', 'marital', 'education', 'default', 'balance',
'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
'pdays', 'previous', 'poutcome', 'y'],
dtype='object')
# Univariate quantitative variables (histogram, box plot, density curve)
# Matplotlib
# Import matplotlib
import matplotlib.pyplot as plt
# Histogram of the age variable (adult clients only)
plt.hist(df_majeur["age"])
(array([ 2128., 12871., 10992., 9008., 6072., 3310., 429., 251.,
116., 14.]),
array([19. , 26.6, 34.2, 41.8, 49.4, 57. , 64.6, 72.2, 79.8, 87.4, 95. ]),
<BarContainer object of 10 artists>)
# Customization: color, axis labels, chart title, number of bins, etc.
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
Text(0, 0.5, 'Effectif')
# Same histogram with a grid added
plt.hist(df_majeur["age"], color = "red", bins = 100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
plt.grid()
# Set the size of the chart.
# plt.figure() opens a NEW figure, so it must run before any drawing call.
# Calling it after plt.hist() leaves the histogram at the default size and
# makes plt.show() emit an extra, empty 500x500 figure.
plt.figure(figsize=(5, 5))
plt.hist(df_majeur["age"], color="red", bins=100)
plt.title("Histogramme de l'âge")
plt.xlabel("Age")
plt.ylabel("Effectif")
plt.show()  # display the chart
<Figure size 500x500 with 0 Axes>
# Change the tick marks of the x axis: one tick every 5 units from 0 to 95
plt.hist(df_majeur["age"], color="red", bins=100)
plt.xticks(range(0, 100, 5))
# Render explicitly so the cell does not end on plt.xticks(), whose return
# value (lists of tick and label objects) would be dumped as cell output.
plt.show()
([<matplotlib.axis.XTick at 0x19faaf8c8d0>, <matplotlib.axis.XTick at 0x19faaf7e790>, <matplotlib.axis.XTick at 0x19fab044b90>, <matplotlib.axis.XTick at 0x19faafcca90>, <matplotlib.axis.XTick at 0x19fab094cd0>, <matplotlib.axis.XTick at 0x19fab096d10>, <matplotlib.axis.XTick at 0x19fab09cf10>, <matplotlib.axis.XTick at 0x19fab09e790>, <matplotlib.axis.XTick at 0x19fab09f390>, <matplotlib.axis.XTick at 0x19fab0a5350>, <matplotlib.axis.XTick at 0x19fab0a7350>, <matplotlib.axis.XTick at 0x19fab0b1510>, <matplotlib.axis.XTick at 0x19fab0b3450>, <matplotlib.axis.XTick at 0x19faaf97610>, <matplotlib.axis.XTick at 0x19fab0b5950>, <matplotlib.axis.XTick at 0x19fab0b4a90>, <matplotlib.axis.XTick at 0x19fab0b9b10>, <matplotlib.axis.XTick at 0x19fab0bbb10>, <matplotlib.axis.XTick at 0x19faab0e050>, <matplotlib.axis.XTick at 0x19fab0c23d0>], [Text(0, 0, '0'), Text(5, 0, '5'), Text(10, 0, '10'), Text(15, 0, '15'), Text(20, 0, '20'), Text(25, 0, '25'), Text(30, 0, '30'), Text(35, 0, '35'), Text(40, 0, '40'), Text(45, 0, '45'), Text(50, 0, '50'), Text(55, 0, '55'), Text(60, 0, '60'), Text(65, 0, '65'), Text(70, 0, '70'), Text(75, 0, '75'), Text(80, 0, '80'), Text(85, 0, '85'), Text(90, 0, '90'), Text(95, 0, '95')])
# Same histogram with ticks every 5 units starting at 18 (18, 23, ..., 88)
plt.hist(df_majeur["age"], color="red", bins=100)
plt.xticks(range(18, 90, 5))
# Render explicitly so the cell does not end on plt.xticks(), whose return
# value (lists of tick and label objects) would be dumped as cell output.
plt.show()
([<matplotlib.axis.XTick at 0x19fab10f890>, <matplotlib.axis.XTick at 0x19fab0d6790>, <matplotlib.axis.XTick at 0x19faad6e610>, <matplotlib.axis.XTick at 0x19fab1fad10>, <matplotlib.axis.XTick at 0x19fab1fc250>, <matplotlib.axis.XTick at 0x19fab1fed10>, <matplotlib.axis.XTick at 0x19fab11c810>, <matplotlib.axis.XTick at 0x19fab205a10>, <matplotlib.axis.XTick at 0x19fab205bd0>, <matplotlib.axis.XTick at 0x19fab2159d0>, <matplotlib.axis.XTick at 0x19fab217950>, <matplotlib.axis.XTick at 0x19fab219c90>, <matplotlib.axis.XTick at 0x19fab219410>, <matplotlib.axis.XTick at 0x19fab2201d0>, <matplotlib.axis.XTick at 0x19fab2222d0>], [Text(18, 0, '18'), Text(23, 0, '23'), Text(28, 0, '28'), Text(33, 0, '33'), Text(38, 0, '38'), Text(43, 0, '43'), Text(48, 0, '48'), Text(53, 0, '53'), Text(58, 0, '58'), Text(63, 0, '63'), Text(68, 0, '68'), Text(73, 0, '73'), Text(78, 0, '78'), Text(83, 0, '83'), Text(88, 0, '88')])
# Save the chart to disk.
# plt.savefig() writes the *current* figure. Called alone, with nothing drawn
# beforehand in the same cell, it saved an empty canvas (a figure with
# 0 axes). The histogram is therefore drawn here, saved, and only then shown.
plt.hist(df_majeur["age"], color="red", bins=100)
plt.xticks(range(18, 90, 5))
plt.savefig("graphique.png")
plt.show()
<Figure size 640x480 with 0 Axes>
# Box plot of the quantitative variable age (adult clients only)
plt.boxplot(df_majeur["age"])
plt.title("Boxplot de la variable âge")
plt.show()
# Charts of a quantitative variable with seaborn
# Import seaborn
import seaborn as sns
# Histogram of the duration of the last contact, in seconds.
# The figure is created BEFORE plotting: plt.figure() opens a new figure, so
# calling it after sns.histplot() produced an extra empty 500x500 figure and
# left the histogram at the default size.
plt.figure(figsize=(5, 5))
sns.histplot(df["duration"])
plt.title("Histogramme de la variable durée")
plt.show()
<Figure size 500x500 with 0 Axes>
# Add the density curve on top of the histogram (kde=True).
# The figure is created BEFORE plotting: plt.figure() opens a new figure, so
# calling it after sns.histplot() produced an extra empty 500x500 figure and
# left the histogram at the default size.
plt.figure(figsize=(5, 5))
sns.histplot(df["duration"], kde=True, color="red")
plt.title("Histogramme de la variable durée")
plt.show()
<Figure size 500x500 with 0 Axes>
# Box plot with seaborn
import seaborn as sns
# The chart is announced and titled as a box plot, so draw one: the previous
# call to sns.histplot() produced a histogram under a "Boxplot" title.
sns.boxplot(x=df_majeur["duration"], color="green")
plt.title("Boxplot de la variable durée")
plt.show()
# Box plot of the duration variable (adult clients only), drawn with seaborn
sns.boxplot(df_majeur["duration"])
<Axes: >
# `sns.seaborn` does not exist: seaborn exposes plotting functions, and the
# call raised AttributeError. Use the box plot function of this section,
# here with the variable on the x axis.
sns.boxplot(x=df_majeur["duration"])
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[8], line 1 ----> 1 sns.seaborn(df_majeur["duration"]) AttributeError: module 'seaborn' has no attribute 'seaborn'
# Density curve of the duration variable (adult clients only)
sns.kdeplot((df_majeur["duration"]), color = "green")
plt.title("Densité de probabilité")
Text(0.5, 1.0, 'Densité de probabilité')
# Univariate charts for qualitative variables
# Bar chart
# Pie chart
# Bar chart expressed in counts
# Count the observations for each education level.
# The two columns are named explicitly: the names produced by a bare
# value_counts().reset_index() differ between pandas versions, while the
# cells below rely on "education" and "count".
effectif = (
    df_majeur["education"]
    .value_counts()
    .rename_axis("education")
    .reset_index(name="count")
)
effectif
| education | count | |
|---|---|---|
| 0 | secondary | 23197 |
| 1 | tertiary | 13298 |
| 2 | primary | 6846 |
| 3 | unknown | 1850 |
# Check the type of the counts table
type(effectif)
pandas.core.frame.DataFrame
# Colors passed to the bar and pie charts below (four entries)
couleurs = ["red", "blue", "green", "yellow"]
couleurs
['red', 'blue', 'green', 'yellow']
# Bar chart of the number of clients per education level
import matplotlib.pyplot as plt
plt.bar(effectif["education"], effectif["count"], color = couleurs)
plt.xlabel("Niveau d'éducation")
plt.ylabel("effectif")
plt.title("Répartition par niveau d'éducation")
plt.show()
# Bar chart of proportions: share of each education level as a percentage,
# rounded to a whole number
effectif["proportion"] = (effectif["count"] * 100 / effectif["count"].sum()).round()
plt.bar(x=effectif["education"], height=effectif["proportion"], color=couleurs)
<BarContainer object of 4 artists>
# Proportion bar chart with the percentage written above each bar
effectif["proportion"] = round((100*effectif["count"]) / effectif["count"].sum())
plt.bar(effectif["education"], effectif["proportion"], color=couleurs)
# Bar i sits at x position i; its label is anchored at the bar's height.
# The loop body must be indented, otherwise the cell does not parse.
for i, freq in enumerate(effectif["proportion"]):
    plt.text(i, freq, str(freq), ha="center", va="bottom")
plt.show()
# Draw the pie chart of education levels, each slice labelled with its
# percentage to one decimal place
plt.pie(effectif["count"], labels=effectif["education"], autopct="%1.1f%%", colors=couleurs)
plt.show()
# Bar plot with seaborn: number of clients per education level
sns.barplot(x="education", y="count", data=effectif)
# The bars are on the count scale, so each label is anchored at the bar's
# count while displaying the proportion. Anchoring at the proportion value
# (at most 100) put every label at the bottom of bars thousands of units tall.
for i, (n, freq) in enumerate(zip(effectif["count"], effectif["proportion"])):
    plt.text(i, n, str(freq), ha="center", va="bottom")
plt.show()
# Reload the dataset from the semicolon-separated CSV file
# (requires pandas to be imported as `pd` in the running session)
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
df
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[10], line 1 ----> 1 df = pd.read_csv("donnees_marketing_banque.csv", sep = ";") 2 df NameError: name 'pd' is not defined
# Analysis of a qualitative variable against a quantitative one
# Box plot of age for each value of the variable y
# Example with one variable:
# relation between subscribing to the deposit (y) and the client's age
import seaborn as sns
# Keep only adult clients; the bound is inclusive so that clients who are
# exactly 18 are not discarded.
df_majeur = df[df["age"] >= 18]
sns.boxplot(x="y", y="age", data=df_majeur)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[9], line 6 1 # Analyse des variables qualitatives et quantitatives 2 # Boxplot entre la variable y et l'âge 3 # Exemple d'une variable 4 # relation entre le fait de souscrire au dépôt et la durée du prêt 5 import seaborn as sns ----> 6 df_majeur = df[df["age"] > 18] 7 sns.boxplot(x ="y", y ="age", data = df_majeur) NameError: name 'df' is not defined
# Import pandas and reload the dataset from the semicolon-separated CSV file
import pandas as pd
df = pd.read_csv("donnees_marketing_banque.csv", sep = ";")
df
| id | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | id1 | 58 | management | married | tertiary | no | 2143.0 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
| 1 | id2 | 44 | technician | single | secondary | no | 29.0 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
| 2 | id3 | 33 | entrepreneur | married | secondary | no | 2.0 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
| 3 | id4 | 47 | blue-collar | married | unknown | no | 1506.0 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
| 4 | id5 | 33 | unknown | single | unknown | no | 1.0 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 45206 | id45207 | 51 | technician | married | tertiary | no | 825.0 | no | no | cellular | 17 | nov | 977 | 3 | -1 | 0 | unknown | yes |
| 45207 | id45208 | 71 | retired | divorced | primary | no | 1729.0 | no | no | cellular | 17 | nov | 456 | 2 | -1 | 0 | unknown | yes |
| 45208 | id45209 | 72 | retired | married | secondary | no | 5715.0 | no | no | cellular | 17 | nov | 1127 | 5 | 184 | 3 | success | yes |
| 45209 | id45210 | 57 | blue-collar | married | secondary | no | 668.0 | no | no | telephone | 17 | nov | 508 | 4 | -1 | 0 | unknown | no |
| 45210 | id45211 | 37 | entrepreneur | married | secondary | no | 2971.0 | no | no | cellular | 17 | nov | 361 | 2 | 188 | 11 | other | no |
45211 rows × 18 columns
import seaborn as sns
# Keep only adult clients; the bound is inclusive so that clients who are
# exactly 18 are not discarded.
df_majeur = df[df["age"] >= 18]
# Box plot of the duration variable for each value of y
sns.boxplot(x="y", y="duration", data=df_majeur)
<Axes: xlabel='y', ylabel='duration'>
# Box plot of age for each value of y
sns.boxplot(x="y", y="age", data = df_majeur)
<Axes: xlabel='y', ylabel='age'>
# Violin plot of age for each value of y
sns.violinplot(x="y", y="age", data = df_majeur)
<Axes: xlabel='y', ylabel='age'>
# Two qualitative variables
# Contingency table: rows are the values of y, columns the education levels,
# and each cell counts the matching observations
table_contingence = pd.crosstab(
    index=df_majeur["y"],
    columns=df_majeur["education"],
)
table_contingence
| education | primary | secondary | tertiary | unknown |
|---|---|---|---|---|
| y | ||||
| no | 6257 | 20747 | 11302 | 1603 |
| yes | 589 | 2450 | 1996 | 247 |
# Draw the count plot: number of clients per value of y, split by education
sns.countplot(x="y", hue = "education", data = df_majeur)
<Axes: xlabel='y', ylabel='count'>
# Proportions via normalize=True: within each value of y, the share of every
# education level, flattened into a table with a "proportion" column
prop_table = (
    df_majeur.groupby(by="y")["education"]
    .value_counts(normalize=True)
    .reset_index(name="proportion")
)
prop_table
| y | education | proportion | |
|---|---|---|---|
| 0 | no | secondary | 0.519858 |
| 1 | no | tertiary | 0.283194 |
| 2 | no | primary | 0.156782 |
| 3 | no | unknown | 0.040166 |
| 4 | yes | secondary | 0.463839 |
| 5 | yes | tertiary | 0.377887 |
| 6 | yes | primary | 0.111511 |
| 7 | yes | unknown | 0.046763 |
# Plot the proportions computed in prop_table: one group of bars per value
# of y, one bar per education level
sns.barplot(x="y", y="proportion", hue = "education", data = prop_table)
<Axes: xlabel='y', ylabel='proportion'>
# Two quantitative variables
# Scatter plot with matplotlib: duration on the x axis, age on the y axis
import matplotlib.pyplot as plt
plt.scatter(df_majeur["duration"], df_majeur["age"], c ="blue")
plt.xlabel("Durée")
plt.ylabel("Age")
Text(0, 0.5, 'Age')
# Same scatter plot drawn with seaborn
sns.scatterplot(x = "duration", y = "age", data = df_majeur)
<Axes: xlabel='duration', ylabel='age'>
# Correlation heatmap between the quantitative variables
# Names of the quantitative columns to keep
var_quantitative = [
    "age",
    "balance",
    "day",
    "duration",
    "campaign",
    "pdays",
    "previous",
]
# Restrict the adult-clients table to those columns
df_quanti = df_majeur.loc[:, var_quantitative]
df_quanti
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| 0 | 58 | 2143.0 | 5 | 261 | 1 | -1 | 0 |
| 1 | 44 | 29.0 | 5 | 151 | 1 | -1 | 0 |
| 2 | 33 | 2.0 | 5 | 76 | 1 | -1 | 0 |
| 3 | 47 | 1506.0 | 5 | 92 | 1 | -1 | 0 |
| 4 | 33 | 1.0 | 5 | 198 | 1 | -1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 45206 | 51 | 825.0 | 17 | 977 | 3 | -1 | 0 |
| 45207 | 71 | 1729.0 | 17 | 456 | 2 | -1 | 0 |
| 45208 | 72 | 5715.0 | 17 | 1127 | 5 | 184 | 3 |
| 45209 | 57 | 668.0 | 17 | 508 | 4 | -1 | 0 |
| 45210 | 37 | 2971.0 | 17 | 361 | 2 | 188 | 11 |
45191 rows × 7 columns
# Correlation matrix of the quantitative variables
matrice_correlation = df_quanti.corr()
matrice_correlation
| age | balance | day | duration | campaign | pdays | previous | |
|---|---|---|---|---|---|---|---|
| age | 1.000000 | 0.097703 | -0.009394 | -0.004741 | 0.004580 | -0.023838 | 0.001271 |
| balance | 0.097703 | 1.000000 | 0.004320 | 0.021543 | -0.014680 | 0.003381 | 0.016645 |
| day | -0.009394 | 0.004320 | 1.000000 | -0.030197 | 0.162371 | -0.093157 | -0.051751 |
| duration | -0.004741 | 0.021543 | -0.030197 | 1.000000 | -0.084614 | -0.001565 | 0.001188 |
| campaign | 0.004580 | -0.014680 | 0.162371 | -0.084614 | 1.000000 | -0.088676 | -0.032885 |
| pdays | -0.023838 | 0.003381 | -0.093157 | -0.001565 | -0.088676 | 1.000000 | 0.454803 |
| previous | 0.001271 | 0.016645 | -0.051751 | 0.001188 | -0.032885 | 0.454803 | 1.000000 |
# Data type of each column of the adult-clients table
df_majeur.dtypes
id object age int64 job object marital object education object default object balance float64 housing object loan object contact object day int64 month object duration int64 campaign int64 pdays int64 previous int64 poutcome object y object dtype: object
# Heatmap of the correlation matrix, each cell annotated with its value
# formatted to two decimals
sns.heatmap(matrice_correlation, annot=True, cmap="coolwarm", fmt=".2f")
<Axes: >
# Pair plot for a quick look at pairwise relationships, colored by y
sns.pairplot(df_majeur, hue="y")
# Plotly: for interactive charts
import pandas as pd
import numpy as np
import plotly.express as px

# Build a DataFrame holding a made-up daily time series covering 2022
np.random.seed(42)  # fixed seed so the random values are reproducible
date_rng = pd.date_range("2022-01-01", "2022-12-31", freq="D")
data = dict(Date=date_rng, Valeur=np.random.randn(date_rng.size))
# NOTE(review): this rebinds `df`, replacing the DataFrame loaded from the CSV
df = pd.DataFrame(data)

# Build the line chart of the series with plotly express
fig = px.line(
    data_frame=df,
    x="Date",
    y="Valeur",
    title="exemple des séries temporelles",
)
# Display the interactive chart
fig.show()